In this project we will extract features from the shapes present in the images in order to classify the EMNIST dataset.
The features to be extracted are: the area, eccentricity, equivalent diameter, Euler number, extent, centroid, mean intensity, major and minor axis lengths and their ratio, orientation, perimeter, solidity and perimeter-to-area ratio of the largest connected component, plus the standard deviation of the grey-level image.
The classifiers that will be used are: an SVM with a linear kernel and gradient boosting.
We will use the scikit-learn implementations of both classifiers.
import sys
sys.path.append('../../')
from sarpy.datasets import load_emnist
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from skimage import filters
from skimage.morphology import disk, binary_dilation
from skimage.measure import label, regionprops
X_train, y_train, X_test, y_test, X_valid, y_valid, mapping, nb_classes = load_emnist('balanced', validation=True)
X_train = np.squeeze(X_train)
X_valid = np.squeeze(X_valid)
X_test = np.squeeze(X_test)
y_train = np.ravel(y_train)
y_valid = np.ravel(y_valid)
y_test = np.ravel(y_test)
print(np.shape(X_train), np.shape(y_train))
def reduce(data, labels, n_samples, n_class, mapping):
    """Build a balanced subset with n_samples random images per class."""
    images_class = []
    for class_ in mapping:
        images_class += [np.where(labels == class_)[0]]
    data_red = []
    label_red = []
    for class_ in images_class:
        index_class = np.random.choice(class_, n_samples, replace=False)
        data_red += [data[index_class]]
        label_red += [labels[class_[0]]] * n_samples
    data_red = np.array(data_red).reshape((n_samples * n_class, 28, 28))
    label_red = np.array(label_red)
    return data_red, label_red
n_trains = [300, 500, 700, 900]
n_valids = [100, 200, 300, 400]
n_tests = [100, 200, 300, 400]
X_train_red = []
y_train_red = []
X_valid_red = []
y_valid_red = []
X_test_red = []
y_test_red = []
for i in range(len(n_trains)):
    xt, yt = reduce(X_train, y_train, n_trains[i], nb_classes, mapping)
    X_train_red += [xt]
    y_train_red += [yt]
    xte, yte = reduce(X_test, y_test, n_tests[i], nb_classes, mapping)
    X_test_red += [xte]
    y_test_red += [yte]
    xv, yv = reduce(X_valid, y_valid, n_valids[i], nb_classes, mapping)
    X_valid_red += [xv]
    y_valid_red += [yv]
print(len(X_train_red), len(X_test_red), len(X_valid_red))
def bin_image(imset, thresh=40):
    """Binarise an image (or image set) with a fixed grey-level threshold."""
    imsetb = imset > thresh
    imsetb = imsetb.astype(int)
    return imsetb
X_trainb = []
X_testb = []
X_validb = []
for i in range(len(n_trains)):
    X_trainb += [[bin_image(x_t, 50) for x_t in X_train_red[i]]]
    X_validb += [[bin_image(x_v, 50) for x_v in X_valid_red[i]]]
    X_testb += [[bin_image(x_te, 50) for x_te in X_test_red[i]]]
print(len(X_trainb), len(X_testb), len(X_validb))
for i in range(47):
    plt.subplot(1, 6, 1)
    plt.imshow(X_train_red[0][i*n_trains[0]], cmap='gray')
    plt.title("Train")
    plt.subplot(1, 6, 2)
    plt.imshow(X_trainb[0][i*n_trains[0]], cmap='gray')
    plt.subplot(1, 6, 3)
    plt.imshow(X_valid_red[0][i*n_valids[0]], cmap='gray')
    plt.title("Validation")
    plt.subplot(1, 6, 4)
    plt.imshow(X_validb[0][i*n_valids[0]], cmap='gray')
    plt.subplot(1, 6, 5)
    plt.imshow(X_test_red[0][i*n_tests[0]], cmap='gray')
    plt.title("Test")
    plt.subplot(1, 6, 6)
    plt.imshow(X_testb[0][i*n_tests[0]], cmap='gray')
    print(y_train_red[0][i*n_trains[0]])
    plt.show()
def bin_otsu(images):
    """Binarise every image with its own Otsu threshold."""
    new_images = []
    for image in images:
        t = filters.threshold_otsu(image)
        new_images += [(image > t).astype(int)]
    return np.array(new_images)
X_train_otsu = [bin_otsu(xt) for xt in X_train_red]
X_valid_otsu = [bin_otsu(xv) for xv in X_valid_red]
X_test_otsu = [bin_otsu(xte) for xte in X_test_red]
print(len(X_train_otsu), len(X_valid_otsu), len(X_test_otsu), np.shape(X_train_otsu[0]))
for i in range(47):
    plt.subplot(1, 3, 1)
    plt.imshow(X_train_red[0][i*n_trains[0]], cmap='gray')
    plt.subplot(1, 3, 2)
    plt.imshow(X_trainb[0][i*n_trains[0]], cmap='gray')
    plt.subplot(1, 3, 3)
    plt.imshow(X_train_otsu[0][i*n_trains[0]], cmap='gray')
    plt.show()
def connect(imgs):
    """Dilate each image until it has at most two connected components.

    Also returns the indices of the images that needed dilation."""
    selem = disk(2)  # radius-2 disk structuring element
    new_imgs = []
    coefs = []
    for i, img in enumerate(imgs):
        while np.max(label(img)) > 2:
            img = binary_dilation(img, selem)
            coefs += [i]
        new_imgs += [img]
    return new_imgs, np.unique(coefs)
def hist_components(imgs, text):
    """Plot a histogram of the number of connected components per image."""
    new_imgs = [label(img) for img in imgs]
    comps = [np.max(new_img) for new_img in new_imgs]
    plt.hist(comps)
    plt.title(text)
    plt.show()
for i in range(len(n_trains)):
    hist_components(X_train_otsu[i], "Number of connected components per image before filtering - Training set")
    hist_components(X_test_otsu[i], "Number of connected components per image before filtering - Test set")
    hist_components(X_valid_otsu[i], "Number of connected components per image before filtering - Validation set")
def remove_noise(imgs, thresh):
    """Zero out connected components smaller than `thresh` pixels.

    Also returns the indices of the images that were modified."""
    new_imgs = []
    coefs = []
    for i, img in enumerate(imgs):
        imgl = label(img)
        new_img = np.copy(img)
        if np.max(imgl) > 1:
            props = regionprops(imgl)
            areas = [props[k].area for k in range(np.max(imgl))]
            for j, area in enumerate(areas):
                if area < thresh:
                    coefs += [i]
                    # Blank out the bounding box of the small component.
                    a, b, c, d = props[j].bbox
                    new_img[a:c, b:d] = 0
        new_imgs += [new_img]
    return new_imgs, coefs
X_train_otsu_clean = []
X_valid_otsu_clean = []
X_test_otsu_clean = []
coefstrain = []
coefsvalid = []
coefstest = []
for i in range(len(X_train_otsu)):
    xt, coefs = remove_noise(X_train_otsu[i], 10)
    X_train_otsu_clean += [xt]
    coefstrain += [coefs]
    xte, coefst = remove_noise(X_test_otsu[i], 10)
    X_test_otsu_clean += [xte]
    coefstest += [coefst]
    xv, coefsv = remove_noise(X_valid_otsu[i], 10)
    X_valid_otsu_clean += [xv]
    coefsvalid += [coefsv]
print(len(X_train_otsu_clean), len(X_valid_otsu_clean), len(X_test_otsu_clean), np.shape(X_train_otsu_clean[0]))
for i in coefstrain[0]:
    plt.subplot(1, 2, 1)
    plt.imshow(X_train_otsu[0][i], cmap='gray')
    plt.title("Train - Original")
    plt.subplot(1, 2, 2)
    plt.imshow(X_train_otsu_clean[0][i], cmap='gray')
    plt.title("Train - Filtered")
    plt.show()
for i in coefstest[2]:
    plt.subplot(1, 2, 1)
    plt.imshow(X_test_otsu[2][i], cmap='gray')
    plt.title("Test - Original")
    plt.subplot(1, 2, 2)
    plt.imshow(X_test_otsu_clean[2][i], cmap='gray')
    plt.title("Test - Filtered")
    plt.show()
for i in range(len(X_train_otsu)):
    hist_components(X_train_otsu_clean[i], "Number of connected components per image after filtering - Training set")
    hist_components(X_test_otsu_clean[i], "Number of connected components per image after filtering - Test set")
    hist_components(X_valid_otsu_clean[i], "Number of connected components per image after filtering - Validation set")
from skimage.measure import regionprops, label
from scipy import ndimage
The features extracted from the shape are, as previously cited: area, eccentricity, equivalent diameter, Euler number, extent, centroid, mean intensity, major and minor axis lengths and their ratio, orientation, perimeter, solidity, perimeter-to-area ratio, and the standard deviation of the grey levels.
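As a quick illustration of the API (a minimal, self-contained sketch on a toy array, not part of the project pipeline), skimage's regionprops exposes each of these descriptors as an attribute of a region object:

import numpy as np
from skimage.measure import label, regionprops

toy = np.zeros((9, 9), dtype=int)
toy[2:7, 3:6] = 1  # a single 5x3 rectangular blob
props = regionprops(label(toy), intensity_image=toy.astype(float))[0]
print(props.area)                    # 15 pixels
print(props.eccentricity)            # elongation of the fitted ellipse
print(props.equivalent_diameter)     # diameter of the circle with the same area
print(props.extent)                  # area / bounding-box area, 1.0 here
print(props.perimeter / props.area)  # compactness-style ratio used below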
def feature_extraction(image_bin, image_gray):
    """Compute the shape descriptors listed above on the largest component."""
    features = []
    img_binl = label(image_bin)
    props = regionprops(label_image=img_binl, intensity_image=image_gray)
    # If several components survive, keep the one with the largest area.
    max_comp = 0
    if np.max(img_binl) > 1:
        areas = [props[i].area for i in range(np.max(img_binl))]
        max_comp = np.argmax(areas)
    features.append(props[max_comp].area)
    features.append(props[max_comp].eccentricity)
    features.append(props[max_comp].equivalent_diameter)
    features.append(props[max_comp].euler_number)
    features.append(props[max_comp].extent)
    lcx, lcy = props[max_comp].centroid
    features.append(lcx)
    features.append(lcy)
    features.append(props[max_comp].mean_intensity)
    features.append(props[max_comp].major_axis_length)
    features.append(props[max_comp].minor_axis_length)
    features.append(props[max_comp].major_axis_length / props[max_comp].minor_axis_length)
    features.append(props[max_comp].orientation)
    features.append(props[max_comp].perimeter)
    features.append(props[max_comp].solidity)
    features.append(props[max_comp].perimeter / props[max_comp].area)
    features.append(ndimage.standard_deviation(image_gray))
    return features
f_trains = []
f_valids = []
f_tests = []
for i in range(len(X_train_otsu_clean)):
    f_trains += [[feature_extraction(imageb, np.squeeze(imageg)) for imageb, imageg in zip(X_train_otsu_clean[i], X_train_red[i])]]
    print("Created train feature vector")
    f_tests += [[feature_extraction(imageb, np.squeeze(imageg)) for imageb, imageg in zip(X_test_otsu_clean[i], X_test_red[i])]]
    print("Created test feature vector")
    f_valids += [[feature_extraction(imageb, np.squeeze(imageg)) for imageb, imageg in zip(X_valid_otsu_clean[i], X_valid_red[i])]]
    print("Created valid feature vector")
print(len(f_trains), len(f_valids), len(f_tests), len(f_trains[0]))
def sep_dig(data, labels, n_samples):
    """Split a class-ordered set into digits (first 10 classes) and letters."""
    data_dig = data[:10*n_samples]
    labels_dig = labels[:10*n_samples]
    data_let = data[10*n_samples:]
    labels_let = labels[10*n_samples:]
    return data_dig, labels_dig, data_let, labels_let
f_trains_dig, y_trains_dig, f_trains_let, y_trains_let = [] , [], [], []
f_valids_dig, y_valids_dig, f_valids_let, y_valids_let = [] , [], [], []
f_tests_dig, y_tests_dig, f_tests_let, y_tests_let = [] , [], [], []
for i in range(len(f_trains)):
    f_train_dig, y_train_dig, f_train_let, y_train_let = sep_dig(f_trains[i], y_train_red[i], n_trains[i])
    f_trains_dig += [f_train_dig]
    y_trains_dig += [y_train_dig]
    f_trains_let += [f_train_let]
    y_trains_let += [y_train_let]
    f_test_dig, y_test_dig, f_test_let, y_test_let = sep_dig(f_tests[i], y_test_red[i], n_tests[i])
    f_tests_dig += [f_test_dig]
    y_tests_dig += [y_test_dig]
    f_tests_let += [f_test_let]
    y_tests_let += [y_test_let]
    f_valid_dig, y_valid_dig, f_valid_let, y_valid_let = sep_dig(f_valids[i], y_valid_red[i], n_valids[i])
    f_valids_dig += [f_valid_dig]
    y_valids_dig += [y_valid_dig]
    f_valids_let += [f_valid_let]
    y_valids_let += [y_valid_let]
print(y_tests_dig[0].shape)
#TODO: fix it
def permute(data, labels, n_samples, nb_classes):
    """Shuffle the samples and their labels with the same random permutation."""
    perm = np.random.permutation(np.arange(n_samples * nb_classes))
    new_data = np.array(data, copy=True)[perm]
    new_labels = np.array(labels, copy=True)[perm]
    return new_data, new_labels
f_train_perm, y_train_perm, f_test_perm, y_test_perm, f_valid_perm, y_valid_perm = [], [], [], [], [], []
for i in range(len(n_trains)):
    tp, ytp = permute(f_trains[i], y_train_red[i], n_trains[i], nb_classes)
    f_train_perm += [tp]
    y_train_perm += [ytp]
    tep, ytep = permute(f_tests[i], y_test_red[i], n_tests[i], nb_classes)
    f_test_perm += [tep]
    y_test_perm += [ytep]
    vp, yvp = permute(f_valids[i], y_valid_red[i], n_valids[i], nb_classes)
    f_valid_perm += [vp]
    y_valid_perm += [yvp]
print(y_test_perm[0][2010])
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.base import clone
import time
def feature_sel(train, test, valid):
    """Drop near-constant features; the selector is fitted on the training set only."""
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    train_sel = sel.fit_transform(train)
    # Transform (not refit) the test and validation sets so the same
    # features are kept in all three splits.
    test_sel = sel.transform(test)
    valid_sel = sel.transform(valid)
    print(train_sel.shape, test_sel.shape, valid_sel.shape)
    return train_sel, test_sel, valid_sel
def normalize(train, test, valid):
    """Standardise features; the scaler is fitted on the training set only."""
    stdScaler = StandardScaler()
    train_norm = stdScaler.fit_transform(train)
    # Reuse the training statistics instead of refitting on test/validation data.
    test_norm = stdScaler.transform(test)
    valid_norm = stdScaler.transform(valid)
    return train_norm, test_norm, valid_norm
def transform(trains, tests, valids):
    train_norm, test_norm, valid_norm = [], [], []
    for i in range(len(trains)):
        train, test, valid = normalize(*feature_sel(trains[i], tests[i], valids[i]))
        train_norm += [train]
        test_norm += [test]
        valid_norm += [valid]
    print(len(train_norm), len(test_norm), len(valid_norm))
    return train_norm, test_norm, valid_norm
def pca(trains, tests, valids, n_comp=4):
    train_pca, test_pca, valid_pca = [], [], []
    for i in range(len(trains)):
        # Fit PCA on the training set and project the other sets with it,
        # so all three splits share the same components.
        p = PCA(n_components=n_comp).fit(trains[i])
        train_pca += [p.transform(trains[i])]
        test_pca += [p.transform(tests[i])]
        valid_pca += [p.transform(valids[i])]
    return train_pca, test_pca, valid_pca
The classifiers that we used in this project were an SVM with a linear kernel and gradient boosting.
SVMs are supervised learning models used for data classification. Given training points represented as p-dimensional vectors, an SVM assigns a new point to a class by finding a (p-1)-dimensional hyperplane that separates the classes; we choose the hyperplane that maximizes the distance from it to the nearest data point on each side. Gradient boosting instead builds an ensemble of shallow decision trees, adding one tree at a time so that each new tree corrects the errors of the current ensemble.
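To make the maximum-margin idea concrete, here is a minimal sketch on toy 2-D data (not the EMNIST features) using the same SVC class we train below; the support vectors it keeps are exactly the points closest to the separating hyperplane:

import numpy as np
from sklearn.svm import SVC

X = np.array([[0., 0.], [0., 1.], [1., 0.],   # class 0
              [3., 3.], [3., 4.], [4., 3.]])  # class 1
y = np.array([0, 0, 0, 1, 1, 1])
clf = SVC(kernel='linear', C=1.0).fit(X, y)
print(clf.support_vectors_)     # the boundary points that define the margin
print(clf.predict([[2., 2.]]))  # a new point is assigned to the nearer side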
clfs = [SVC(kernel='linear', C = 0.01), GradientBoostingClassifier()]
clfs_dig = [SVC(kernel='linear', C = 0.01), GradientBoostingClassifier()]
clfs_let = [SVC(kernel='linear', C = 0.01), GradientBoostingClassifier()]
clfs_pca = [SVC(kernel='linear', C = 0.01), GradientBoostingClassifier()]
clfs_dig_pca = [SVC(kernel='linear', C = 0.01), GradientBoostingClassifier()]
clfs_let_pca = [SVC(kernel='linear', C = 0.01), GradientBoostingClassifier()]
clfs_names = ["SVM with Linear Kernel", "Gradient Boosting"]
train_norm, test_norm, valid_norm = transform(f_trains, f_tests, f_valids)
train_digs_norm, test_digs_norm, valid_digs_norm = transform(f_trains_dig, f_tests_dig, f_valids_dig)
train_lets_norm, test_lets_norm, valid_lets_norm = transform(f_trains_let, f_tests_let, f_valids_let)
def classify(trains, trains_label, tests, tests_label, valids, valids_label, clfs, clfs_names):
    score = dict()
    valid_e = dict()
    times = dict()
    classifier = dict()
    print(tests_label[0].shape)
    for cla, name in zip(clfs, clfs_names):
        if name not in score:
            score[name], valid_e[name], times[name], classifier[name] = [], [], [], []
        for j in range(len(trains)):
            # Clone the estimator so each training-set size gets its own
            # fitted model (refitting the same object would overwrite it).
            clf = clone(cla)
            start = time.time()
            classifier[name] += [clf.fit(trains[j], trains_label[j])]
            end = time.time()
            score[name] += [[clf.score(tests[k], tests_label[k]) for k in range(len(tests))]]
            valid_e[name] += [[clf.score(valids[k], valids_label[k]) for k in range(len(valids))]]
            times[name] += [end - start]
    return classifier, score, valid_e, times
classifiers, scores, valid_errs, times = classify(train_norm, y_train_red, test_norm, y_test_red, valid_norm, y_valid_red, clfs, clfs_names)
classifiers_dig, scores_dig, valid_errs_dig, times_dig = classify(train_digs_norm, y_trains_dig, test_digs_norm, y_tests_dig, valid_digs_norm, y_valids_dig, clfs_dig, clfs_names)
classifiers_let, scores_let, valid_errs_let, times_let = classify(train_lets_norm, y_trains_let, test_lets_norm, y_tests_let, valid_lets_norm, y_valids_let, clfs_let, clfs_names)
train_pca, test_pca, valid_pca = pca(f_trains, f_tests, f_valids, 5)
train_digs_pca, test_digs_pca, valid_digs_pca = pca(f_trains_dig, f_tests_dig, f_valids_dig)
train_lets_pca, test_lets_pca, valid_lets_pca = pca(f_trains_let, f_tests_let, f_valids_let)
classifiers_pca, scores_pca, valid_errs_pca, times_pca = classify(train_pca, y_train_red, test_pca, y_test_red, valid_pca, y_valid_red, clfs_pca, clfs_names)
classifiers_dig_pca, scores_dig_pca, valid_errs_dig_pca, times_dig_pca = classify(train_digs_pca, y_trains_dig, test_digs_pca, y_tests_dig, valid_digs_pca, y_valids_dig, clfs_dig_pca, clfs_names)
classifiers_let_pca, scores_let_pca, valid_errs_let_pca, times_let_pca = classify(train_lets_pca, y_trains_let, test_lets_pca, y_tests_let, valid_lets_pca, y_valids_let, clfs_let_pca, clfs_names)
from sklearn import metrics
import pprint, binascii
import seaborn as sns
characters = [binascii.b2a_qp(mapping[i]).decode('ascii') for i in mapping]
def heatmap(test, label, classifier, characters, title):
    fig, ax = plt.subplots(figsize=(12, 12))
    pred = classifier.predict(test)
    sns.heatmap(metrics.confusion_matrix(label, pred), xticklabels=characters, yticklabels=characters)
    plt.title(title)
    plt.show()
def accuracy_graph(clfs_names, scores, n_trains, n_tests, name):
    for clf_name in clfs_names:
        fig, ax = plt.subplots(figsize=(30, 10))
        for i, (scr, size) in enumerate(zip(scores[clf_name], n_trains)):
            plt.subplot(1, len(n_trains), i + 1)
            plt.plot(scr)
            plt.title("Accuracy in " + name + " set - " + clf_name)
            plt.xticks(np.arange(len(n_tests)), (n_tests))
            plt.ylabel("Accuracy - training set with " + str(size) + " samples per class")
        plt.show()
def time_graph(clfs_names, times, n_trains):
    for clf in clfs_names:
        fig, ax = plt.subplots(figsize=(5, 4))
        plt.plot(times[clf])
        plt.xticks(np.arange(len(n_trains)), (n_trains))
        plt.title("Execution time for " + clf + " per training set size (s)")
        plt.ylabel("seconds")
        plt.xlabel("Training set size")
        plt.show()
heatmap(test_norm[1], y_test_red[1], classifiers[clfs_names[0]][0], characters, title='Heatmap of confusion matrix - SVM')
heatmap(test_norm[1], y_test_red[1], classifiers[clfs_names[1]][0], characters, title='Heatmap of confusion matrix - Gradient Boosting')
accuracy_graph(clfs_names, scores, n_trains, n_tests, "test")
accuracy_graph(clfs_names, valid_errs, n_trains, n_valids, "validation")
time_graph(clfs_names, times, n_trains)
heatmap(test_digs_norm[1], y_tests_dig[1], classifiers_dig[clfs_names[0]][0], characters[:10], title='Heatmap of confusion matrix - SVM')
heatmap(test_digs_norm[1], y_tests_dig[1], classifiers_dig[clfs_names[1]][0], characters[:10], title='Heatmap of confusion matrix - Gradient Boosting')
accuracy_graph(clfs_names, scores_dig, n_trains, n_tests, "test")
accuracy_graph(clfs_names, valid_errs_dig, n_trains, n_valids, "validation")
time_graph(clfs_names, times_dig, n_trains)
heatmap(test_lets_norm[1], y_tests_let[1], classifiers_let[clfs_names[0]][0], characters[10:], title='Heatmap of confusion matrix - SVM')
heatmap(test_lets_norm[1], y_tests_let[1], classifiers_let[clfs_names[1]][0], characters[10:], title='Heatmap of confusion matrix - Gradient Boosting')
accuracy_graph(clfs_names, scores_let, n_trains, n_tests, "test")
accuracy_graph(clfs_names, valid_errs_let, n_trains, n_valids, "validation")
time_graph(clfs_names, times_let, n_trains)
heatmap(test_pca[1], y_test_red[1], classifiers_pca[clfs_names[0]][0], characters, title='Heatmap of confusion matrix - SVM')
heatmap(test_pca[1], y_test_red[1], classifiers_pca[clfs_names[1]][0], characters, title='Heatmap of confusion matrix - Gradient Boosting')
accuracy_graph(clfs_names, scores_pca, n_trains, n_tests, "test")
accuracy_graph(clfs_names, valid_errs_pca, n_trains, n_valids, "validation")
time_graph(clfs_names, times_pca, n_trains)
heatmap(test_digs_pca[1], y_tests_dig[1], classifiers_dig_pca[clfs_names[0]][0], characters[:10], title='Heatmap of confusion matrix - SVM')
heatmap(test_digs_pca[1], y_tests_dig[1], classifiers_dig_pca[clfs_names[1]][0], characters[:10], title='Heatmap of confusion matrix - Gradient Boosting')
accuracy_graph(clfs_names, scores_dig_pca, n_trains, n_tests, "test")
accuracy_graph(clfs_names, valid_errs_dig_pca, n_trains, n_valids, "validation")
time_graph(clfs_names, times_dig_pca, n_trains)
heatmap(test_lets_pca[1], y_tests_let[1], classifiers_let_pca[clfs_names[0]][0], characters[10:], title='Heatmap of confusion matrix - SVM')
heatmap(test_lets_pca[1], y_tests_let[1], classifiers_let_pca[clfs_names[1]][0], characters[10:], title='Heatmap of confusion matrix - Gradient Boosting')
accuracy_graph(clfs_names, scores_let_pca, n_trains, n_tests, "test")
accuracy_graph(clfs_names, valid_errs_let_pca, n_trains, n_valids, "validation")
time_graph(clfs_names, times_let_pca, n_trains)